from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import cv2
%matplotlib inline
# Survey the raw training set: bucket file names by their 3-letter class
# prefix ("cat"/"dog") and bar-plot how many images each class has.
img_files = [f for f in listdir('./train') if isfile(join('./train', f))]
img_data = defaultdict(list)
for fname in img_files:
    img_data[fname[:3]].append(fname)
img_data = pd.DataFrame(img_data)
img_data = pd.Series(
    [img_data[label].count() for label in ('cat', 'dog')],
    index=['cat', 'dog'],
)
img_data.plot(kind='bar')
训练集中猫和狗的图片各占一半,分别为12500张。
from keras.applications.xception import Xception
from keras.applications.xception import preprocess_input as xception_preprocess
from keras.applications.xception import decode_predictions as xception_decode
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input as inception_preprocess
from keras.applications.inception_v3 import decode_predictions as inception_decode
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input as resnet_preprocess
from keras.applications.resnet50 import decode_predictions as resnet_decode
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input as vgg16_preprocess
from keras.applications.vgg16 import decode_predictions as vgg16_decode
from keras.preprocessing import image
# ImageNet WordNet synset IDs for the 118 dog-breed classes; used to decide
# whether a pretrained classifier's predictions contain any kind of dog.
dogs = [
    'n02085620','n02085782','n02085936','n02086079',
    'n02086240','n02086646','n02086910','n02087046',
    'n02087394','n02088094','n02088238','n02088364',
    'n02088466','n02088632','n02089078','n02089867',
    'n02089973','n02090379','n02090622','n02090721',
    'n02091032','n02091134','n02091244','n02091467',
    'n02091635','n02091831','n02092002','n02092339',
    'n02093256','n02093428','n02093647','n02093754',
    'n02093859','n02093991','n02094114','n02094258',
    'n02094433','n02095314','n02095570','n02095889',
    'n02096051','n02096177','n02096294','n02096437',
    'n02096585','n02097047','n02097130','n02097209',
    'n02097298','n02097474','n02097658','n02098105',
    'n02098286','n02098413','n02099267','n02099429',
    'n02099601','n02099712','n02099849','n02100236',
    'n02100583','n02100735','n02100877','n02101006',
    'n02101388','n02101556','n02102040','n02102177',
    'n02102318','n02102480','n02102973','n02104029',
    'n02104365','n02105056','n02105162','n02105251',
    'n02105412','n02105505','n02105641','n02105855',
    'n02106030','n02106166','n02106382','n02106550',
    'n02106662','n02107142','n02107312','n02107574',
    'n02107683','n02107908','n02108000','n02108089',
    'n02108422','n02108551','n02108915','n02109047',
    'n02109525','n02109961','n02110063','n02110185',
    'n02110341','n02110627','n02110806','n02110958',
    'n02111129','n02111277','n02111500','n02111889',
    'n02112018','n02112137','n02112350','n02112706',
    'n02113023','n02113186','n02113624','n02113712',
    'n02113799','n02113978']
# ImageNet WordNet synset IDs for the 7 cat classes.
cats = [
    'n02123045','n02123159','n02123394','n02123597',
    'n02124075','n02125311','n02127052',
]
# Shuffle the file list in place with a fixed seed so the sampled subsets
# used in the experiments below are reproducible.
np.random.RandomState(42).shuffle(img_files)
def abnormals_detect(detector, img_files, img_size, top, preprocess, decode):
    """Flag images whose top-`top` ImageNet predictions contain no cat or dog.

    Args:
        detector: pretrained Keras classification model.
        img_files: iterable of file names found under ./train.
        img_size: (width, height) the model expects as input.
        top: number of top predictions to inspect per image.
        preprocess: the model's matching preprocess_input function.
        decode: the model's matching decode_predictions function.

    Returns:
        List of file names that are probably not cats or dogs (outliers).
    """
    # Build the membership set once: O(1) lookups instead of scanning the
    # `dogs` and `cats` lists for every prediction of every image.
    known_ids = set(dogs) | set(cats)
    abnormals = []
    for img_file in img_files:
        img = image.load_img('./train/' + img_file, target_size=img_size)
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess(x)
        preds = decode(detector.predict(x), top=top)
        # Each prediction is (synset_id, label, score); any() short-circuits
        # on the first hit rather than scanning all `top` predictions.
        if not any(pred[0] in known_ids for pred in preds[0]):
            abnormals.append(img_file)
    return abnormals
def display_image(img_paths):
    """Show each ./train image in `img_paths`, one figure per image."""
    for img_path in img_paths:
        # OpenCV loads images as BGR; matplotlib expects RGB.
        rgb = cv2.cvtColor(cv2.imread('./train/' + img_path), cv2.COLOR_BGR2RGB)
        plt.imshow(rgb)
        plt.show()
# First pass: Xception (best reported ImageNet accuracy among the four models)
# on a 1000-image sample, checking only the top-5 predictions.
xception_detector = Xception(weights='imagenet')
xception_abnormals = abnormals_detect(xception_detector, img_files[:1000], (299, 299), 5, xception_preprocess, xception_decode)
print('{} abnormals detected: {}'.format(len(xception_abnormals), xception_abnormals))
display_image(xception_abnormals)
根据Keras文档,准确率最高的模型是Xception,其Top-1和Top-5准确率分别为0.790和0.945。先以这个模型作为检测器,以1000张混洗过的图片作为样本,看看Top-5下都有哪些图片被识别为非猫非狗,检测出来有14张异常值,误报率较高,但的确也发现了一些有问题的图片,比如有1张就是非猫非狗。下面将Top值提高到10看看。
# Same 1000-image sample, widening to the top-10 predictions to reduce
# false positives.
xception_abnormals = abnormals_detect(xception_detector, img_files[:1000], (299, 299), 10, xception_preprocess, xception_decode)
print('{} abnormals detected: {}'.format(len(xception_abnormals), xception_abnormals))
display_image(xception_abnormals)
误报率仍然比较高,下面再分别试验下Top-20和Top-30。
# Widen further to top-20, then top-30, still on the 1000-image sample.
xception_abnormals = abnormals_detect(xception_detector, img_files[:1000], (299, 299), 20, xception_preprocess, xception_decode)
print('{} abnormals detected: {}'.format(len(xception_abnormals), xception_abnormals))
display_image(xception_abnormals)
xception_abnormals = abnormals_detect(xception_detector, img_files[:1000], (299, 299), 30, xception_preprocess, xception_decode)
print('{} abnormals detected: {}'.format(len(xception_abnormals), xception_abnormals))
display_image(xception_abnormals)
从得到的结果可以看出Top值为30时检测到了1张图片,该图片是一个商标,既不是猫也不是狗,下面把样本数量提高到5000,再看看检测结果。
# Keep top-30 but grow the sample to 5000 images.
xception_abnormals = abnormals_detect(xception_detector, img_files[:5000], (299, 299), 30, xception_preprocess, xception_decode)
print('{} abnormals detected: {}'.format(len(xception_abnormals), xception_abnormals))
display_image(xception_abnormals)
上面5000个样本的检测结果显示的确有一些图片既不是猫也不是狗,但还是有一些正常的图片被误报了,不算太理想,把Top值提高到60,样本数量提高到10000再看看。
# Top-60 over 10000 images: the precision/recall trade-off chosen for the
# final full-set detection below.
xception_abnormals = abnormals_detect(xception_detector, img_files[:10000], (299, 299), 60, xception_preprocess, xception_decode)
print('{} abnormals detected: {}'.format(len(xception_abnormals), xception_abnormals))
display_image(xception_abnormals)
从10000张图片中检测出来17张图片,其中有很多是非猫非狗的异常图片,虽然也有一部分图片被误报了,但是都属于图片内容比较复杂或者质量不高的,因此Top值为60是一个比较理想的值,下面结合4种模型,对整个训练集图片进行检测,然后将结果合并起来得到最终的异常值检测结果。
# Final detection over the WHOLE training set with all four pretrained
# models (each at its native input size), all using top-60.
xception_abnormals = abnormals_detect(xception_detector, img_files, (299, 299), 60, xception_preprocess, xception_decode)
print('{} abnormals detected by Xception'.format(len(xception_abnormals)))
resnet_detector = ResNet50(weights='imagenet')
resnet_abnormals = abnormals_detect(resnet_detector, img_files, (224, 224), 60, resnet_preprocess, resnet_decode)
print('{} abnormals detected by ResNet'.format(len(resnet_abnormals)))
inception_detector = InceptionV3(weights='imagenet')
inception_abnormals = abnormals_detect(inception_detector, img_files, (299, 299), 60, inception_preprocess, inception_decode)
print('{} abnormals detected by Inception'.format(len(inception_abnormals)))
vgg16_detector = VGG16(weights='imagenet')
vgg16_abnormals = abnormals_detect(vgg16_detector, img_files, (224, 224), 60, vgg16_preprocess, vgg16_decode)
print('{} abnormals detected by VGG16'.format(len(vgg16_abnormals)))
# Merge the outliers flagged by the four detectors into a single set.
total_abnormals = (
    set(xception_abnormals)
    | set(resnet_abnormals)
    | set(inception_abnormals)
    | set(vgg16_abnormals)
)
print('{} abnormals detected: {}'.format(len(total_abnormals), list(total_abnormals)))
综合四种模型输出的结果得到118张图片,通过人工分析发现这些被判定为异常值的图片有以下几种类型:1. 比较模糊的图片;2. 内容比较复杂、质量不高的图片;3. 既不是猫也不是狗的图片;4. 不属于真实世界猫狗的图片(如卡通、画作等)。
我认为情况1应该保留,因为这里不能假定所有输入模型的图片都是清晰的,总会有一些模糊的图片,模型应当对这样的图片具有一定的健壮性;情况2也应该保留,模型也应该对一些复杂的图片具有一定的健壮性;情况3应该删除;情况4则比较主观,但我训练该模型的目的是为了识别真实世界的猫狗,所以我选择删除。最终被删除的有38张图片。
import os

# Files manually confirmed as true outliers (non-cat/non-dog content or
# not real-world animals); remove them from the training set.
deleted = [
    'dog.10190.jpg', 'cat.7377.jpg', 'cat.5418.jpg', 'dog.8898.jpg', 'dog.10747.jpg',
    'dog.6475.jpg', 'cat.11184.jpg', 'cat.4833.jpg', 'dog.1895.jpg', 'cat.10029.jpg',
    'dog.1308.jpg', 'dog.8736.jpg', 'cat.12272.jpg', 'dog.11299.jpg', 'cat.7968.jpg',
    'cat.2939.jpg', 'dog.4367.jpg', 'cat.5351.jpg', 'dog.10237.jpg', 'cat.8456.jpg',
    'dog.5604.jpg', 'dog.1773.jpg', 'dog.3889.jpg', 'cat.9171.jpg', 'cat.4338.jpg',
    'cat.7564.jpg', 'dog.9517.jpg', 'dog.12376.jpg', 'cat.4688.jpg', 'dog.9188.jpg',
    'dog.10801.jpg', 'dog.1194.jpg', 'dog.10161.jpg', 'dog.1259.jpg', 'cat.10712.jpg',
    'dog.2614.jpg', 'cat.8470.jpg', 'cat.3672.jpg',
]
# Loop variable renamed from `image`, which shadowed the
# keras.preprocessing.image module imported earlier in the file.
for fname in deleted:
    os.remove('./train/' + fname)
# Re-scan ./train after the clean-up and report the per-class counts.
img_files = [f for f in listdir('./train') if isfile(join('./train', f))]
img_data = defaultdict(list)
for fname in img_files:
    img_data[fname[:3]].append(fname)
print('{} in total, cat {}, dog {}'.format(len(img_files), len(img_data['cat']), len(img_data['dog'])))
清洗后的训练集图片有24962张,其中猫有12483张,狗有12479张。
为了方便之后使用Keras的ImageDataGenerator,需要创建一个新文件夹trains,然后把猫和狗的照片分别放到其下的子文件夹cats和dogs中,为了节约磁盘空间,这里使用符号连接来实现。
import os
import shutil

# Split the cleaned file list by class prefix so each class can get its own
# symlinked subdirectory for Keras's ImageDataGenerator.
image_files = os.listdir('./train')
cat_images = [f for f in image_files if f.startswith('cat')]
dog_images = [f for f in image_files if f.startswith('dog')]
def mkdir_from_scratch(dir_name):
    """Create `dir_name` as a fresh, empty directory.

    If a directory already exists at that path, it is deleted first along
    with everything inside it.
    """
    already_there = os.path.exists(dir_name)
    if already_there:
        shutil.rmtree(dir_name)
    os.mkdir(dir_name)
# Build the directory layout ImageDataGenerator expects, using symlinks to
# avoid duplicating the images on disk:
#   ./trains/cats/*, ./trains/dogs/*  -> links into ./train
#   ./tests/test                      -> link to ./test
mkdir_from_scratch('./trains')
os.mkdir('./trains/cats')
os.mkdir('./trains/dogs')
mkdir_from_scratch('./tests')
os.symlink('../test', './tests/test')
# NOTE(review): the loop variable `image` shadows the keras.preprocessing
# image module imported earlier — rename it if that module is needed after
# this point.
for image in cat_images:
    os.symlink('../../train/'+image, './trains/cats/'+image)
for image in dog_images:
    os.symlink('../../train/'+image, './trains/dogs/'+image)
from keras.layers.core import Lambda
from keras.layers import Input, GlobalAveragePooling2D
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint
import h5py
def output_gap_features(model_class, image_size, preprocess_func, model_name, batch_size=64):
    """Export global-average-pooled bottleneck features of a pretrained CNN.

    Runs `model_class` (headless, ImageNet weights) over ./trains and ./tests
    and writes the pooled features plus the training labels to
    bottleneck_features_<model_name>.h5.

    Args:
        model_class: Keras application class (e.g. ResNet50, Xception).
        image_size: (width, height) the model expects as input.
        preprocess_func: matching preprocess_input function, applied in-graph.
        model_name: tag used in the output file name.
        batch_size: generator batch size (default 64, the original value).
    """
    width = image_size[0]
    height = image_size[1]
    # Input -> in-graph preprocessing -> base CNN -> global average pooling.
    x = Input((height, width, 3))
    x = Lambda(preprocess_func)(x)
    base_model = model_class(input_tensor=x, weights='imagenet', include_top=False)
    model = Model(base_model.input, GlobalAveragePooling2D()(base_model.output))
    # shuffle=False keeps the feature rows aligned with generator.classes
    # and generator.filenames.
    gen = ImageDataGenerator()
    train_generator = gen.flow_from_directory('./trains', image_size, shuffle=False, batch_size=batch_size)
    test_generator = gen.flow_from_directory('./tests', image_size, shuffle=False, batch_size=batch_size, class_mode=None)
    # Use ceil(samples / batch_size) steps. The original `samples//64 + 1`
    # requested one batch too many whenever samples was an exact multiple of
    # 64, duplicating the first samples in the exported features.
    train_steps = -(-train_generator.samples // batch_size)
    test_steps = -(-test_generator.samples // batch_size)
    train = model.predict_generator(train_generator, train_steps, verbose=1)
    test = model.predict_generator(test_generator, test_steps, verbose=1)
    # Persist the features; pass mode 'w' explicitly (h5py's implicit default
    # mode was deprecated and later removed).
    with h5py.File('bottleneck_features_{}.h5'.format(model_name), 'w') as h:
        h.create_dataset('train', data=train)
        h.create_dataset('test', data=test)
        h.create_dataset('label', data=train_generator.classes)
    print('output gap features for {} finished'.format(model_name))
# Extract and cache bottleneck features for each of the four pretrained CNNs,
# each at its native input resolution.
# bottleneck features for ResNet
output_gap_features(ResNet50, (224, 224), resnet_preprocess, 'resnet')
# bottleneck features for Xception
output_gap_features(Xception, (299, 299), xception_preprocess, 'xception')
# bottleneck features for Inception
output_gap_features(InceptionV3, (299, 299), inception_preprocess, 'inception')
# bottleneck features for VGG 16
output_gap_features(VGG16, (224, 224), vgg16_preprocess, 'vgg16')
from sklearn.utils import shuffle

# Load the cached bottleneck features of all four models and concatenate
# them along the feature axis into a single design matrix.
np.random.seed(42)
feature_files = [
    'bottleneck_features_inception.h5',
    'bottleneck_features_resnet.h5',
    'bottleneck_features_vgg16.h5',
    'bottleneck_features_xception.h5'
]
X_train = []
X_test = []
for h5_file in feature_files:
    with h5py.File(h5_file, 'r') as h:
        X_train.append(np.array(h['train']))
        X_test.append(np.array(h['test']))
        # Labels are identical across the four files (extraction used
        # shuffle=False), so keeping the last copy is fine.
        y_train = np.array(h['label'])
X_train = np.concatenate(X_train, axis=1)
X_test = np.concatenate(X_test, axis=1)
# Shuffle features and labels in unison (driven by the numpy seed above).
X_train, y_train = shuffle(X_train, y_train)
print(X_train.shape, X_test.shape, y_train.shape)
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import Adadelta
# Baseline classifier: a single sigmoid unit (logistic regression) on top of
# the concatenated bottleneck features.
input_tensor = Input(X_train.shape[1:])
x = input_tensor
x = Dense(1, activation='sigmoid')(x)
model = Model(input_tensor, x)
model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])
# 20% of the (already shuffled) training data is held out for validation.
hist = model.fit(X_train, y_train, batch_size=64, epochs=100, validation_split=0.2)
def plot_loss_and_accuracy(hist):
    """Plot training (blue) vs validation (red) loss and accuracy curves.

    The first 5 epochs are skipped so the steep initial descent does not
    compress the y-axis.
    """
    curves = (('loss', 'val_loss', 'Loss'), ('acc', 'val_acc', 'Accuracy'))
    fig = plt.figure()
    fig.set_size_inches(18, 6)
    for position, (train_key, val_key, axis_label) in enumerate(curves, start=1):
        plt.subplot(1, 2, position)
        plt.plot(hist.history[train_key][5:], 'b')
        plt.plot(hist.history[val_key][5:], 'r')
        plt.ylabel(axis_label)
    plt.show()
plot_loss_and_accuracy(hist)
首先,从验证集的Loss曲线(左图红色)可以看到呈上升趋势,说明模型过拟合了,需要防止过拟合,重新构建模型,增加Dropout层。
# Second attempt: insert Dropout before the sigmoid layer to curb the
# overfitting seen in the baseline's validation loss.
input_tensor = Input(X_train.shape[1:])
x = input_tensor
# p = 0.25, still overfitting
x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)
model = Model(input_tensor, x)
model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train, y_train, batch_size=64, epochs=100, validation_split=0.2)
plot_loss_and_accuracy(hist)
验证集的Loss曲线出现明显的震荡,需要进一步调参:减少学习率
# Third attempt: keep Dropout(0.5), tune the Adadelta learning rate, and
# checkpoint the weights with the best validation loss.
input_tensor = Input(X_train.shape[1:])
x = input_tensor
# p = 0.25, still overfitting
x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)
model = Model(input_tensor, x)
# At epochs=100 the val loss was still falling; lr of 1e-3/5e-3/1e-2 proved
# slightly too small, so increase it moderately.
model.compile(optimizer=Adadelta(lr=3e-2), loss='binary_crossentropy', metrics=['accuracy'])
check_pointer = ModelCheckpoint(filepath='best_model.h5', verbose=1, save_best_only=True)
hist = model.fit(X_train, y_train, batch_size=64, epochs=100, validation_split=0.2, callbacks=[check_pointer], verbose=1)
plot_loss_and_accuracy(hist)
# Restore the best checkpoint and build the Kaggle submission file.
model.load_weights('best_model.h5')
y_pred = model.predict(X_test, verbose=1)
# Clip away from 0/1 to bound the log-loss penalty of a confidently wrong label.
y_pred = y_pred.clip(min=0.005, max=0.995)
df = pd.read_csv('sample_submission.csv')
gen = ImageDataGenerator()
test_generator = gen.flow_from_directory('./tests', (224, 224), shuffle=False, batch_size=16, class_mode=None)
for i, fname in enumerate(test_generator.filenames):
    # Filenames look like 'test/1234.jpg'; the numeric stem is the 1-based id.
    index = int(fname[fname.rfind('/') + 1:fname.rfind('.')])
    # DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
    # .at is the scalar-access replacement.
    df.at[index - 1, 'label'] = y_pred[i]
df.to_csv('pred.csv', index=None)
df.head(10)